In [7]:
library(MASS)
In [8]:
# install.packages('/notebooks/ISLR_1.0.tar.gz')
In [9]:
library(ISLR) # library from the book
In [10]:
names(Boston) # housing values in suburbs of Boston; part of the MASS package
Out[10]:
In [11]:
# open the help page for the Boston data set (column descriptions)
?Boston
Out[11]:
In [12]:
# lstat: lower status of the population (percent)
# medv: median value of owner-occupied homes in $1000s
# side-by-side panels: 1 row, 2 columns
par(mfrow = c(1,2))
plot(Boston$lstat, Boston$medv)
# different notation, same result: plots Y-axis~X-axis from Boston dataframe
plot(medv~lstat, Boston)
In [13]:
# Simple linear regression: the response medv is modelled on the
# single predictor lstat from the Boston data frame.
fit1 <- lm(medv ~ lstat, data = Boston)
In [14]:
fit1 # printing the fit shows a negative slope for lstat
Out[14]:
In [15]:
# as the % of 'lower status' inhabitants increases,
# the median housing value decreases
summary(fit1) # coefficient table: lstat is highly significant here
Out[15]:
In [16]:
plot(medv~lstat, Boston)
# overlay the fitted regression line; abline adds straight lines to a plot
abline(fit1, col="red")
In [17]:
names(fit1) # components stored in the lm object (coefficients, residuals, ...)
Out[17]:
In [18]:
# 95% confidence intervals for the coefficients of the fit
confint(fit1)
Out[18]:
In [19]:
# predict() can be used to query a model fit
# here: predict medv (and conf intervals) given 3 new values for lstat
predict(fit1, data.frame(lstat=c(5,10,15)), interval="confidence")
# returns the fit at the 3 vals, their lower/upper confidence intervals
Out[19]:
In [20]:
# plot(fit1) # gives lots of plots / stat summaries
In [21]:
# Two-predictor regression: medv modelled jointly on lstat and age.
fit2 <- lm(medv ~ lstat + age, data = Boston)
summary(fit2)
# age is also a significant predictor for medv, but weaker than lstat;
# multiple R-squared = share of variance explained (higher is better)
Out[21]:
In [22]:
# Regress medv on every other column of Boston ('.' = all remaining vars).
fit3 <- lm(medv ~ ., data = Boston)
summary(fit3)
# age was significant when paired with lstat alone, but with all the
# other predictors in the model it is no longer significant
Out[22]:
In [23]:
par(mfrow = c(2,2)) # 2x2 grid for the four diagnostic plots
plot(fit3)
# residuals vs. fitted values: we're looking for non-linearity
# we see here, that the model doesn't capture everything that is
# going on
In [24]:
# update() refits an existing model with a modified formula:
# same response, all predictors of fit3 except age and indus --
# i.e. we keep only the clearly significant terms.
fit4 <- update(fit3, ~ . - age - indus)
In [25]:
par(mfrow = c(2,2)) # 2x2 grid of diagnostic plots for the reduced model
plot(fit4)
In [26]:
summary(fit4)
Out[26]:
In [27]:
# linear model with an interaction between lstat and age;
# in formula syntax 'lstat*age' expands to lstat + age + lstat:age
fit5 = lm(medv~lstat*age, Boston)
# there's a somewhat significant interaction betw. lstat and age
# (although age in itself is not significant)
In [28]:
summary(fit5)
Out[28]:
In [29]:
# Quadratic term: '^' has a special meaning inside a model formula,
# so wrap lstat^2 in the identity function I() to get literal squaring.
fit6 <- lm(medv ~ lstat + I(lstat^2), Boston)
In [30]:
summary(fit6)
Out[30]:
In [31]:
# NOTE(review): attach() is generally discouraged (it can mask other
# objects on the search path); it makes the Boston columns visible
# so lstat/medv below resolve without the Boston$ prefix
attach(Boston) # put Boston vars in local namespace
In [32]:
plot(medv~lstat)
# add quadratic fit to plot,
# i.e. for each val from lstat print the fitted val of the model.
# we can't use abline(), b/c this isn't linear
points(lstat, fitted(fit6), col="red", pch=20)
In [33]:
plot(medv~lstat)
# fit medv as a polynomial of degree 4 in lstat
# (lstat/medv resolve via the attach(Boston) above)
fit7 = lm(medv~poly(lstat, 4))
points(lstat, fitted(fit7), col="blue", pch=20)
# higher-degree polynomials wiggle more: risk of overfitting the data
In [34]:
# demo of the 20 standard plotting symbols (pch values 1..20)
# cex: how much should the plotting symbol be magnified
# e.g. cex.axis=2 increases axis labels
plot(1:20, 1:20, pch=1:20, cex=2)
In [40]:
# ?Carseats # simul. data set containing sales of child car seats at 400 different stores.
names(Carseats) # Carseats comes from the ISLR package loaded above
Out[40]:
In [38]:
summary(Carseats)
Out[38]:
In [44]:
# use all predictors from the frame (except for Sales) to predict Sales
# add interactions betw. Income and Advertising, as well as Age and Price
# NOTE: this reassigns fit1, shadowing the earlier Boston fit of that name
fit1 = lm(Sales~.+Income:Advertising+Age:Price, Carseats)
In [46]:
summary(fit1)
# signif. interaction betw. Income and Advertising
# Age:Price interaction isn't signif.
Out[46]:
In [50]:
#?contrasts # Set and view the contrasts associated with a factor.
In [51]:
contrasts(Carseats$ShelveLoc)
# shows how R will encode that variable in a linear model
# ShelveLoc: a 3-level factor, encoded as dummy (indicator) columns
Out[51]:
In [106]:
# Scatter-plot y against x, overlay the least-squares line in red,
# and return the summary of the fitted simple regression.
regplot <- function(x, y) {
  model <- lm(y ~ x)
  plot(x, y)
  abline(model, col = "red")
  summary(model)
}
In [105]:
regplot(Carseats$Price, Carseats$Sales)
Out[105]:
In [109]:
# ... is forwarded to plot(), like kwargs in Python.
# Scatter-plot y vs x, overlay the least-squares line in red, print the
# model summary, and return the fitted lm object for further use.
# Fix: the original computed summary(fit) mid-body and discarded it --
# inside a function the result is not auto-printed, so the summary never
# appeared; print() it explicitly.
regplot <- function(x, y, ...) {
  fit <- lm(y ~ x)
  plot(x, y, ...)
  abline(fit, col = "red")
  print(summary(fit))  # was silently discarded before
  return(fit)
}
In [108]:
regplot(Carseats$Price, Carseats$Sales, xlab="Price", ylab="Sales", col="blue", pch=20)
Out[108]:
In [93]:
# Note: this regresses Price on Sales -- the reverse of regplot() above,
# which fits y ~ x, i.e. Sales ~ Price.
carfit <- lm(Price ~ Sales, data = Carseats)
summary(carfit)
Out[93]:
In [91]:
# regplot() returns the lm object; $call stores the expression used to fit it
testfit = regplot(Carseats$Price, Carseats$Sales)
testfit$call
Out[91]:
In [95]:
colnames(Carseats)# list all column names (cf. Carseats$Price)
Out[95]:
In [97]:
# Look up the position of a column by name, then use that index to
# recover the name itself (column names are unique, so match() and
# which(colnames(...) == x) give the same single index here).
x <- "Price"
index <- match(x, colnames(Carseats))
colnames(Carseats)[index]
Out[97]:
In [89]:
# Fit Sepal.Length on Sepal.Width using the built-in iris data frame
# and extract the slope coefficient.
data(iris)
class(iris)
irisfit <- lm(Sepal.Length ~ Sepal.Width, data = iris)
#summary(irisfit)
# coef() is the standard accessor for object$coefficients
coef(irisfit)[2]
Out[89]:
Out[89]: